set.seed(2)

required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander", "ggridges", "spatstat", "broom")
for(i in required_packages) { 
  if(!require(i, character.only = T)) {
    #  if package is not existing, install then load the package
    install.packages(i, dependencies = T)
  require(i, character.only = T)
  }
}

panderOptions('table.alignment.default', "left")

## quality of png's
dpi <- 750

## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15))


theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))

## theme settings for maps
theme_map <- 
  theme_void() +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))

## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)

## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))

## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)

Read data

df <- bind_rows(
    pmap_df(list(0:99), ~read_csv(glue("../proc_data/ahref/export_keywords_100_to_500/{.x}.csv"))) %>% 
        clean_names() %>% mutate(cat = "100-500"),
    pmap_df(list(0:99), ~read_csv(glue("../proc_data/ahref/export_keywords_500_to_1000/{.x}.csv"))) %>% 
        clean_names() %>% mutate(cat = "500-1000"),
    pmap_df(list(0:49), ~read_csv(glue("../proc_data/ahref/export_keywords_1000_to_10000/{.x}.csv"))) %>% 
        clean_names() %>% mutate(cat = "1000-10000"),
    pmap_df(list(0:33), ~read_csv(glue("../proc_data/ahref/export_keywords_10000_to_1000000000/{.x}.csv"))) %>% 
            clean_names() %>% mutate(cat = "10000+")
)


Preamble


The volume looks weird. It doesn’t follow the initial categories:

df %>% drop_na(volume) %>% 
    group_by(cat) %>% 
    summarise(
        n = n(),
        mean = mean(volume),
        median = median(volume),
        max = max(volume),
        min = min(volume)
        ) %>% 
    arrange(median) %>% 
    pander()
cat n mean median max min
100-500 714519 291.3 100 7330000 10
500-1000 776898 407.7 200 5750000 10
1000-10000 402317 1064 300 5310000 10
10000+ 272614 13835 800 45260000 10

Let’s look at some of the searches that had a low search volume in the initial data set:

df %>% filter(cat == "100-500", volume > 10000) %>% 
    select(keyword, volume) %>% 
    head(5) %>% 
    pander()
keyword volume
new orleans 336000
beauty 135000
mine 84000
stupid 83000
newport news 32000

This doesn’t looks like low volume words. And indeed they were not in the original data sets for 100-500 volume. I’m not sure how they came in.

This removes those keywords that were not in the same category in the data samples I created:

df_orig <- bind_rows(
    pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_100_to_500/{.x}.txt"), col_names = c("keyword"))) %>% 
        clean_names() %>% select(keyword) %>% mutate(cat = "100-500"),
    pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_500_to_1000/{.x}.txt"), col_names = c("keyword"))) %>% 
        clean_names() %>% mutate(cat = "500-1000"),
    pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_1000_to_10000/{.x}.txt"), col_names = c("keyword"))) %>% 
        clean_names() %>% mutate(cat = "1000-10000"),
    pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_10000_to_1000000000/{.x}.txt"), col_names = c("keyword"))) %>% 
            clean_names() %>% mutate(cat = "10000+")
)
df %<>% inner_join(df_orig, by = c("cat", "keyword"))

There are still ~2.5 million rows left:

tibble("Number of rows" = format(df %>% nrow(), big.mark = ",")) %>% 
  pander()
Number of rows
2,494,536

Another issue is that of the representativeness of the samples. Keywords with less than 100 volume are not even represented. And the othe rvolumes are represented at skewed ratios. This makes a large difference, especially if we look at stats based on search instead of based on volume.

To me, it still makes most sense to look at it based on volume. But you’ve been quite clear that this is not what we want. Still, it seems wrong to me to use this unrepresentative dataset, and report things such as mean. That will fully depend on how we happened to create the samples. Since most of searches have low volume, removing those with volume below 100 makes a huge difference.

So, for analyses where this is important, I will perform them in three different ways.

  1. Using the samples here directly

  2. Using the samples here, but scaled so that they are representative of the original data set. (Except that < 100 volume is removed.)

  3. Using the scaled samples, and also go by volume instead of count.

As we can see, these three approaches give quite different results.

length_keyword_files <- function(min, max){
  sql <- glue("SELECT count(*) as `count`
          FROM `dataforseo-bigquery.dataforseo_data.keyword_data` 
          WHERE location = 2840 
          AND keyword_info_search_volume >= {min}
          AND keyword_info_search_volume < {max}")
  tb <- bq_project_query("dataforseo-bigquery", sql)
  df <- bq_table_download(tb) %>% mutate(min = min, max = max)
}

scaling <- map2_df(c(0, 100, 500, 1000, 10000), c(100, 500, 1000, 10000, 1000000000), length_keyword_files) %>% 
    mutate(factor = count / 1000000) %>% relocate(min, max)

scaling %>% pander()
min max count factor
0 100 531523126 531.5
100 500 33059825 33.06
500 1000 5766061 5.766
1000 10000 8449417 8.449
10000 1e+09 1284194 1.284


df %<>% mutate(
  difficulty_cat = case_when(
    difficulty <= 10 ~ "Easy\n(0-10)",
    between(difficulty, 11, 30) ~ "Medium\n(11-30)",
    between(difficulty, 31, 70) ~ "Hard\n(31-70)",
    between(difficulty, 71, 100) ~ "Super hard\n(71-100)"
  )) %>% 
  mutate(difficulty_cat = factor(difficulty_cat, levels = c("Easy\n(0-10)", "Medium\n(11-30)", "Hard\n(31-70)", "Super hard\n(71-100)"))) %>% 
  mutate(log_volume = log10(volume))

dfs <- df %>% sample_n(20000) 

rdf <- bind_rows(
  df %>% filter(between(volume, 100, 500)) %>% 
    sample_n(scaling %>% filter(min == 100) %>% pull(factor) * 2000),
  df %>% filter(between(volume, 100, 500)) %>% 
    sample_n(scaling %>% filter(min == 500) %>% pull(factor) * 2000),
  df %>% filter(between(volume, 1000, 10000)) %>% 
    sample_n(scaling %>% filter(min == 1000) %>% pull(factor) * 2000),
  df %>% filter(between(volume, 10000, 1000000000)) %>% 
    sample_n(scaling %>% filter(min == 10000) %>% pull(factor) * 2000)
)

Keyword difficulty

Mean and median of sample:

tribble(~Mean, ~Median,
        round(mean(df$difficulty, na.rm = T), 2), median(df$difficulty, na.rm = T)) %>% 
  pander()
Mean Median
14.28 6

Mean and median of representative sample:

tribble(~Mean, ~Median,
        round(mean(rdf$difficulty, na.rm = T), 2), median(rdf$difficulty, na.rm = T)) %>% 
  pander()
Mean Median
14.5 6

Mean and median of sample by volume

tribble(~Mean, ~Median,
        round(weighted.mean(rdf$difficulty, rdf$volume, na.rm = T), 2), weighted.median(rdf$difficulty, rdf$volume)) %>% 
  pander()
Mean Median
37.19 33
dfs %>%
  ggplot(aes(x = volume, y = difficulty)) +
  geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
  scale_x_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  labs(title = "Keyword difficulty and volume")

For each doubling of volume, the difficulty increases by 1.63:

lm1 <- lm(difficulty ~ log2(volume), dfs)
lm1 %>% summary()
## 
## Call:
## lm(formula = difficulty ~ log2(volume), data = dfs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.405 -12.575  -7.412   6.201  87.562 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.83205    0.50911   3.598 0.000321 ***
## log2(volume)  1.62619    0.06295  25.834  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.45 on 16063 degrees of freedom
##   (3935 observations deleted due to missingness)
## Multiple R-squared:  0.03989,    Adjusted R-squared:  0.03983 
## F-statistic: 667.4 on 1 and 16063 DF,  p-value: < 2.2e-16
dfs %>% drop_na(difficulty_cat) %>% 
  ggplot(aes(y = volume, x = difficulty_cat)) +
  geom_violin(draw_quantiles = c(0.5)) +
  scale_y_log10(labels = comma) +
  labs(x = "Difficulty category", title = "Keyword difficulty and volume")

People will most likely not understand a violin chart. Alternatives? box blot?

!!!J: In my experience people understand violin charts equally well as box charts, and I have shown them quite a few times. It is almost a box chart, with a median, and some area above and below. So I think we can keep it. If you insist, it’s an easy change to make box charts instead. Although I do think they are not really needed here, since the scatter plots are good.

dfs %>%
  ggplot(aes(x = difficulty, y = cpc)) +
  geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
  scale_y_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  labs(title = "Keyword difficulty and cpc")

dfs %>% drop_na(difficulty_cat) %>% 
  ggplot(aes(y = cpc, x = difficulty_cat)) +
  geom_violin(draw_quantiles = c(0.5)) +
  scale_y_log10(labels = comma) +
  labs(x = "Difficulty category", title = "Keyword difficulty and cpc")

!!!D: similar comments as above.

!!!D: Shame we cannot use the keyword categories here. May be we can try to solve this after the 3rd of November if time remains. Just curios how large the df would be if we select just the columns keyword and info_categories from the original data set. We could do on the google big query page to avoid that rstudio crashes (plus, apply some filters). Just curios to know: You probably have thought of running a left join but why is that not possible?

!!!J: How would you do that exactly in practice? Like, what do I left join on, concretely? I know, the keywords from here. But how do I get that list into the database or the SQL command? Not saying it’s not possible, Im just not sure how to do it.


SERP features

Note there are (at least) two additional SERP feature types, knowledge panel and videos, for which the sample size is too small to be included.

dff <- dfs %>% 
  select(keyword, volume, clicks, cpc, serp_features, cps) %>% 
  separate_rows(serp_features, sep = ",") %>% 
  mutate(serp_features = ifelse(is.na(serp_features), "(None)", serp_features)) %>% 
  filter(!(serp_features %in% c("Videos", "Knowledge panel")))

nones <- dff %>% filter(serp_features == "(None)")

dffn <- dff %>% group_by(keyword) %>% 
  summarise(n_serp = n()) %>% 
  mutate(n_serp = ifelse(keyword %in% nones$keyword, 0, n_serp)) %>% 
  mutate(n_serp = ifelse(n_serp >= 6, "6+", as.character(n_serp))) %>% 
  mutate(n_serp = factor(n_serp, levels = c("0", "1", "2", "3", "4", "5", "6+")))

dffn <- left_join(dffn, dfs, by = "keyword")

In sample:

dff %>% group_by(serp_features) %>% 
  summarise(prop = n() / nrow(dfs)) %>% 
  ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  scale_x_continuous(labels = scales::percent) +
  labs(y = "SERP feature", x = "", title = "Presence of SERP features")

rdff <- rdf %>% 
  select(keyword, volume, clicks, cpc, serp_features, cps) %>% 
  separate_rows(serp_features, sep = ",") %>% 
  mutate(serp_features = ifelse(is.na(serp_features), "(None)", serp_features)) %>% 
  filter(!(serp_features %in% c("Videos", "Knowledge panel")))

nones <- rdff %>% filter(serp_features == "(None)")

rdffn <- rdff %>% group_by(keyword) %>% 
  summarise(n_serp = n()) %>% 
  mutate(n_serp = ifelse(keyword %in% nones$keyword, 0, n_serp)) %>% 
  mutate(n_serp = ifelse(n_serp >= 6, "6+", as.character(n_serp))) %>% 
  mutate(n_serp = factor(n_serp, levels = c("0", "1", "2", "3", "4", "5", "6+")))

rdffn <- left_join(dffn %>% select(keyword, n_serp), dfs, by = "keyword")

Representative:

rdff %>% group_by(serp_features) %>% 
  summarise(prop = n() / nrow(rdff)) %>% 
  ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  scale_x_continuous(labels = scales::percent) +
  labs(y = "SERP feature", x = "", title = "Presence of SERP features")

By volume:

rdff %>% group_by(serp_features) %>% 
  summarise(prop = sum(volume) / sum(rdff$volume)) %>% 
  ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  scale_x_continuous(labels = scales::percent) +
  labs(y = "SERP feature", x = "", title = "Presence of SERP features")

In sample:

dffn %>% group_by(n_serp) %>%
  summarise(n = n() / nrow(dffn)) %>% 
  ggplot(aes(x = n_serp, y = n)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  labs(x = "Number of serp features", y = "", title = "Distribution of SERP features") +
  scale_y_continuous(labels = scales::percent)

By volume:

rdffn %>% group_by(n_serp) %>%
  summarise(prop = sum(volume) / sum(rdff$volume)) %>% 
  ggplot(aes(x = n_serp, y = prop)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  labs(x = "Number of serp features", y = "", title = "Distribution of SERP features") +
  scale_y_continuous(labels = scales::percent)

The knowledge card has a huge effect in reducing the cps, while the other SERP features have limited effect. Searches with the Shopping results SERP feature have higher cps on average.

order <- dff %>% 
  group_by(serp_features) %>%
  summarise(mean_cps = mean(cps, na.rm = T)) %>% 
  arrange(mean_cps) %>% 
  pull(serp_features)

  
dff %>% 
  mutate(serp_features = factor(serp_features, levels = order)) %>% 
  ggplot(aes(y = serp_features, x = cps)) +
  stat_density_ridges(fill = "turquoise4", color = "black") +
  labs(y = "SERP feature", title = "SERP features and cps")

order <- dff %>% 
  group_by(serp_features) %>%
  summarise(mean_volume = mean(volume, na.rm = T)) %>% 
  arrange(mean_volume) %>% 
  pull(serp_features)

  
dff %>% 
  mutate(serp_features = factor(serp_features, levels = order)) %>% 
  ggplot(aes(y = serp_features, x = volume)) +
  scale_x_log10(labels = comma) +
  stat_density_ridges(fill = "turquoise4", color = "black") +
  labs(y = "SERP feature", title = "SERP features and cps")

Low difficulty keywords have fewer SERP features

rdffn %>% mutate(n_serp = as.numeric(n_serp)) %>% 
  drop_na(difficulty_cat) %>% 
  ggplot(aes(x = difficulty_cat, y = n_serp)) +
  geom_boxplot() +
  labs(x = "Difficulty", y = "SERP features", title = "Difficulty and number of SERP features")

rdffn %>% group_by(n_serp) %>%
  summarise(difficulty = mean(difficulty, na.rm = T)) %>% 
  ggplot(aes(x = n_serp, y = difficulty)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
  labs(x = "SERP features", title = "Number of SERP features and mean difficulty", y = "Difficulty")

find_pairs <-  function(){
  rs <- rdff %>% filter(serp_features != "(None)") %>% 
    sample_n(20000)
  kw <- rs %>% distinct(keyword) %>% pull(keyword)
  
  get_table <- function(k){
    a <- rs %>% filter(keyword == k, serp_features != "(None)")
    crossing(v1 = a$serp_features, v2= a$serp_features) %>% filter(v1 < v2)
  }
  
  pairs <- map_dfr(kw, get_table) %>% 
    group_by(v1, v2) %>% 
    summarise(n = n())
  
  pairs %>% write_csv("../proc_data/serp_pairs.csv")
}

pairs <- read_csv("../proc_data/serp_pairs.csv") %>% 
  mutate(n = n / sum(n))

pairs %>% 
  arrange(desc(n)) %>%
  head(10) %>% 
  mutate(pair = glue("{v1} & {v2}")) %>% 
  ggplot(aes(y = reorder(pair, n), x = n)) +
  geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
  labs(x = "", y = "SERP feature pair", title = "Most common SERP feature pairings") +
  scale_x_continuous(labels = scales::percent)

Definition SERP feautres: A SERP feature is any result on a Google Search Engine Results Page (SERP) that is not a traditional organic result. The most common SERP Features are: Rich Snippets which add a visual layer to an existing result (e.g., review stars for product ratings).

!!!D: after the 3rd, may make sense to check if there is anything interesting on relationship between keyword category and SERP features.

Some definitions:

The Clicks column shows exactly how many times per month people tend to click any pages when googling this keyword. Some searches result in a lot of clicks, while other high search volume keywords may not bring in as much traffic from search due to the low number of clicks.

The keyword “chauffeur” has a high search volume of 67,000 searches per month. Yet that volume only resulted in 13,406 clicks. One probable reason could be that Google already gave what people wanted instantly - and there was no need to click on the search results.

!!!D: CPS column definition

The CPS (Click per Search) shows an average number of clicks for all searches. It is basically a correlation between the Clicks metric and the Search Volume of the keyword.

In the example given, people search for “wow chauffeur” less frequently than “chauffeur”, yet the keyword has more Clicks than Searches.

Further investigation reveals that the word “wow” actually stands for “World of Warcraft”, and apparently, people are looking for information on how to summon a “chauffeur” in the game. That makes for a completely different search intent.

And this is why we have the CPS metric.

The higher the CPS (i.e people clicking on a few links to satisfy their search query) – the more chances that you’ll get some traffic even if you’re not ranking #1 for that search query.

More info: https://help.ahrefs.com/en/articles/624151-what-does-clicks-stand-for-in-keywords-explorer


Global volume

dfv <- 
  bind_rows(
    df %>% mutate(region = "US"),
    df %>% mutate(volume = global_volume - volume, region = "International")
    )

In ahref, international volume is higher:

dfv %>% group_by(region) %>% 
  summarise(volume = sum(volume, na.rm = T)) %>% 
  mutate(volume = scales::percent(volume / sum(volume))) %>% 
  pander()
region volume
International 67%
US 33%

However, in the original data set, US volume is much higher:

tribble(~region, ~volume,
        "US", "82%",
        "International", "19%")
## # A tibble: 2 x 2
##   region        volume
##   <chr>         <chr> 
## 1 US            82%   
## 2 International 19%

I will be going by ahref in the following.

Internationally there are more searches with very low volume, while US has more searches with medium volume.

dfv %>% drop_na(volume) %>% 
  mutate(volume_group = case_when(volume < 100 ~ "< 100",
    between(volume, 100, 1000) ~ "100 - 1000",
    between(volume, 1000, 10000) ~ "1000 - 10,000",
    volume > 10000 ~ "10,000 +")) %>% 
  mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 +"))) %>% 
  group_by(volume_group, region) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  mutate(n = n / sum(n)) %>% 
  ggplot(aes(x = volume_group, y = n, fill = region)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.8, color = "black") +
  labs(fill = "Region", x = "Volume", y = "") +
  scale_y_continuous(labels = scales::percent, expand = c(0,0))

There is not a large difference in the number of searches with very high volume. However, the total volume of these searches is a lot higher internationally

dfv %>% drop_na(volume) %>% 
  mutate(volume_group = case_when(volume < 100 ~ "< 100",
    between(volume, 100, 1000) ~ "100 - 1000",
    between(volume, 1000, 10000) ~ "1000 - 10,000",
    between(volume, 10000, 100000) ~ "10,000 - 100,000",
    between(volume, 100000, 1000000) ~ "100,000 - 1M",
    volume > 1000000 ~ "1M +")) %>% 
  mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 - 100,000", "100,000 - 1M", "1M +"))) %>% 
  group_by(volume_group, region) %>% 
  summarise(n = sum(volume)) %>%
  ungroup() %>% 
  mutate(n = n / sum(n)) %>% 
  ggplot(aes(x = volume_group, y = n, fill = region)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.8, color = "black") +
  labs(fill = "Region", x = "Volume", y = "") +
  scale_y_continuous(labels = comma, expand = c(0,0))

df_int <- df %>% mutate(international_volume = global_volume - volume) %>% 
  filter(international_volume > 0, global_volume > 0) %>% 
  mutate(volume_diff = log10(international_volume) - log10(volume))

df_int_s <- df_int %>% sample_n(200000)

df_int_s %>% 
  ggplot(aes(x = volume, y = international_volume)) +
  geom_jitter(size = 0.05, alpha = 0.02, height = 0.15, width = 0.15) +
  scale_x_log10(labels = comma, expand = c(0,0)) +
  scale_y_log10(labels = comma, expand = c(0,0)) +
  geom_abline(intercept = 0, slope = 1, color = "turquoise4", size = 1) +
  labs(x = "US volume", y = "International volume")


We can see that they mostly follow each other, but there are some searches with large difference between them.

Higher volume internationally:

tbl <- df_int %>% 
  arrange(desc(volume_diff)) %>% 
  filter(volume != 60) %>% 
  select(keyword, us_volume = volume, international_volume) %>% 
  head(5)

tbl %>% write_csv("../plots/csv/table_int.csv")
tbl %>% pander()
keyword us_volume international_volume
filmoviplex 10 295990
cloroquina 200 5869800
parivahan sewa 10 276990
jokaroom 10 173990
handball em 20 327980


Higher volume in US:

tbl <- df_int %>% 
  arrange(volume_diff) %>% 
  select(keyword, us_volume = volume, international_volume) %>% 
  head(5)

tbl %>% write_csv("../plots/csv/table_us.csv")
tbl %>% pander()
keyword us_volume international_volume
football playoff schedule 602000 1000
frontier mail 586000 1000
spectrum mobile 526000 1000
chase bank near me 523000 1000
spectrum internet 998000 2000


Searches that have higher volume in US have a higher click-per-search on average than searches that have higher volume internationally.

df_int_s %>% 
  filter(cps < 5) %>% 
  ggplot(aes(x = volume_diff, y = cps)) +
  geom_point(alpha = 0.02, size = 0.02) +
  geom_smooth(method='lm', formula= y~x) +
  scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
  labs(x = "")


They also have a higher cost-per-click on average

df_int_s %>% 
  ggplot(aes(x = volume_diff, y = cpc)) +
  geom_point(alpha = 0.08, size = 0.08) +
  scale_y_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
  labs(x = "")


Searches that have higher volume internationally, tend to have higher difficulty

df_int_s %>% 
  ggplot(aes(x = volume_diff, y = difficulty)) +
  geom_point(alpha = 0.08, size = 0.08) +
  #scale_y_log10(labels = comma) +
  geom_smooth(method='lm', formula= y~x) +
  scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
  labs(x = "")


Clicks

In sample:

tribble(~Mean, ~Median,
        round(mean(df$clicks, na.rm = T), 2), median(df$clicks, na.rm = T)) %>% 
  pander()
Mean Median
3036 306

In representative sample:

tribble(~Mean, ~Median,
        round(mean(rdf$clicks, na.rm = T), 2), median(rdf$clicks, na.rm = T)) %>% 
  pander()
Mean Median
2150 261

By volume

tribble(~Mean, ~Median,
        round(weighted.mean(rdf$clicks, rdf$volume, na.rm = T), 2), weighted.median(rdf$clicks, rdf$volume, na.rm = T)) %>% 
  pander()
Mean Median
397516 20913
log_mean <- 10 ^ (df %>% mutate(clicks = clicks + 1) %>% 
  mutate(log_clicks = log10(clicks)) %>% 
  summarise(m = mean(log_clicks, na.rm = T)) %>% 
  pull(m))

df %>% ggplot(aes(x = clicks)) +
  geom_histogram(fill = "turquoise4", color = "black") +
  scale_x_log10(labels = comma) +
  scale_y_continuous(limits = c(0, 250000), expand = c(0,0)) +
  labs(title = "Distribution of number of clicks", y = "", x = "") +
  geom_vline(xintercept = log_mean, linetype = "dashed", color = "blue", size = 1) +
  ggeasy::easy_remove_y_axis()

Note that this is in sample, so the lowest part of the distribution is not included. Probably does not really make sense.


Return Rate

Comparison of searches with same volume but different return rates:

tbl <- bind_rows(
  df %>% filter(return_rate > 10) %>% 
    summarise(mean_cpc = mean(cpc, na.rm = T), mean_clicks = mean(clicks, na.rm = T), mean_cpc = mean(cps, na.rm = T), mean_difficulty = mean(difficulty, na.rm = T)) %>% 
    mutate(return_rate = "very high") %>% relocate(return_rate),

  df %>% filter(return_rate > 10) %>% 
    select(number, volume) %>% 
    left_join(df %>% filter(return_rate < 10), by = c("number", "volume")) %>% 
    distinct(number, volume, cat, .keep_all = T) %>% 
    summarise(mean_cpc = mean(cpc, na.rm = T), mean_clicks = mean(clicks, na.rm = T), mean_cpc = mean(cps, na.rm = T), mean_difficulty = mean(difficulty, na.rm = T)) %>% 
    mutate(return_rate = "low") %>% relocate(return_rate)
)

tbl %<>% mutate(mean_cpc = round(mean_cpc, 2), mean_clicks = round(mean_clicks, 0), mean_difficulty = round(mean_difficulty, 1))
               
tbl %>% write_csv("../plots/csv/return_rate.csv")
tbl %>% pander()
return_rate mean_cpc mean_clicks mean_difficulty
very high 0.96 71423 18.4
low 0.7 15094 25.6

We can see that searches with high return rates tend to have lower difficulty, and to be clicked on a lot more.